#!/usr/bin/perl

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

use warnings;
use strict;
use File::Spec;
use Getopt::Long;
use LWP::UserAgent;

# Split this script's own path: the volume and directory are reused later to
# place the generated file next to the script, and the script name is quoted
# in the usage message and in the generated Java class comment.
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);

# The Unicode Emoji version is mandatory and must be exactly of the form X.Y.
# It is interpolated into the download URL, the Java class name and the output
# file name, so the pattern is anchored at both ends: a partial match (e.g.
# "11.0.0" or "x11.0/..") would yield a bad URL or an invalid Java identifier.
my $version = '';
my $options_ok = GetOptions("version=s" => \$version);
unless ($options_ok && $version =~ /\A[0-9]+\.[0-9]+\z/) {
    print STDERR "Usage: $script_name -v <version>\n";
    # Only explain the expected format when some value was actually supplied.
    print STDERR "\tversion must be of the form X.Y, e.g. 11.0\n"
        if ($version ne '');
    exit 1;
}

# Location of the emoji test data for the requested version.
my $url = "http://www.unicode.org/Public/emoji/${version}/emoji-test.txt";

# Java identifiers cannot contain dots, so "11.0" becomes "11_0".
(my $underscore_version = $version) =~ tr/./_/;
my $class_name = "EmojiTokenizationTestUnicode_${underscore_version}";
my $output_filename = "${class_name}.java";
# Preamble of the generated Java source file: license header, package and
# imports, the class declaration, the test(Analyzer) driver method, and the
# opening of the "tests" string array.
#
# This is an interpolating heredoc: ${script_name}, ${url} and ${class_name}
# are substituted from the variables above.  "\@" keeps Perl from treating
# "@Ignore" as an array, and "\\" emits a single backslash so that the Java
# source receives an escaped double quote.
#
# The text deliberately ends inside the array initializer; the entries and the
# closing "};" / "}" are printed after the header when the file is written.
# The driver method steps through the array two entries at a time, using
# tests[i] in the failure message and tests[i + 1] as the text to analyze.
my $header =<<"__HEADER__";
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.standard;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.junit.Ignore;

/**
 * This class was automatically generated by ${script_name}
 * from: ${url}
 *
 * emoji-test.txt contains emoji char sequences, which are represented as
 * tokenization tests in this class.
 * 
 */
\@Ignore
public class ${class_name} extends BaseTokenStreamTestCase {

  public void test(Analyzer analyzer) throws Exception {
    for (int i = 0 ; i < tests.length ; i += 2) {
      String test = tests[i + 1];
      try {
        assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
      } catch (Throwable t) {
        throw new Exception("Failed to tokenize \\"" + tests[i] + "\\":", t);        
      }
    }
  }

  private String[] tests = new String[] {
__HEADER__

# Download the emoji test data and split it into lines (LF or CRLF endings).
my @tests = split /\r?\n/, get_URL_content($url);

# The generated Java source is written into the same directory as this script.
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);

# Three-argument open with a lexical handle and low-precedence "or": with "||"
# the die() would bind to the file name string, so a failed open would go
# unnoticed and every later print would silently write nowhere.
open(my $out, '>', $output_path)
    or die "Error opening '$output_path' for writing: $!";

print STDERR "Writing '$output_path'...";

print {$out} $header;

# Each data line becomes two consecutive Java string literals: the original
# line (used in failure messages) followed by the \uXXXX-escaped sequence.
my $isFirst = 1;
for my $line (@tests) {
    next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines

    # Example line: 1F46E 1F3FB 200D 2642 FE0F                 ; fully-qualified     # 👮🏻‍♂️ man police officer: light skin tone
    $line =~ s/\s+$//;     # Trim trailing whitespace
    $line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)

    # The code point list is everything before the first ';'.  A line without
    # that separator cannot be turned into a test, so report it and move on
    # rather than emitting an empty test string.
    my ($test_string) = $line =~ /^(.*?)\s*;/;
    unless (defined $test_string) {
        print STDERR "\nSkipping malformed line (no ';' separator): $line\n";
        next;
    }

    # Entries are separated by a comma and a blank line; none before the first.
    print {$out} ",\n\n" unless $isFirst;
    $isFirst = 0;

    print {$out} "    \"$line\",\n";

    # Prefix every hex code point with \u, then replace code points of five or
    # more hex digits by their UTF-16 surrogate pair, since a Java \u escape
    # takes exactly four hex digits.
    $test_string =~ s/([0-9A-F]+)/\\u$1/g;
    $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
    $test_string =~ s/\s//g;
    print {$out} "    \"${test_string}\"";
}

# Close the array initializer and the class opened by the header.
print {$out} "  };\n}\n";

# Buffered write errors only surface at close time, so check it.
close($out)
    or die "Error closing '$output_path': $!";
print STDERR "done.\n";


# sub above_BMP_char_to_surrogates
#
# Takes the hexadecimal notation of a code point above the BMP (i.e., greater
# than 0xFFFF) and returns the two UTF-16 code units of its surrogate pair,
# high surrogate first, each formatted as at least four uppercase hex digits.
#
# Assumption: the input string is a sequence of more than four hex digits
#
sub above_BMP_char_to_surrogates {
    my ($hex_digits) = @_;
    my $code_point = hex $hex_digits;
    my @surrogates = (
        0xD800 + (($code_point - 0x10000) >> 10),   # high (lead) surrogate
        0xDC00 + ($code_point & 0x3FF),             # low (trail) surrogate
    );
    return map { sprintf("%04X", $_) } @surrogates;
}


# sub get_URL_content
#
# Issues a GET request for the given URL and returns the content of the
# response.  Progress is reported on STDERR.  If the response is not
# successful, its status line is printed to STDERR and the script exits
# with status 1.
#
sub get_URL_content {
    my ($url) = @_;
    print STDERR "Retrieving '$url'...";
    my $reply = LWP::UserAgent->new->request(HTTP::Request->new(GET => $url));
    if ($reply->is_success) {
        print STDERR "done.\n";
        return $reply->content;
    }
    print STDERR "Failed to download '$url':\n\t",$reply->status_line,"\n";
    exit 1;
}
